En primer lugar importamos las librerías necesarias.
from glob import glob
from pathlib import Path
from os import listdir, stat
from shutil import copyfile
import os
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import numpy as np
Definimos una función para mostrar las imágenes del conjunto de datos.
def init_fig(height=15, width=15):
    """Create an empty matplotlib figure of the requested size.

    Args:
        height: Figure height in inches.
        width: Figure width in inches.

    Returns:
        The newly created ``Figure``.
    """
    figure = plt.figure()
    # Width first, then height, as set_size_inches expects.
    figure.set_size_inches(width, height)
    return figure
Definimos las rutas de los datos, pesos y archivos de destino.
# Root folders: scraper output and generated CSV files.
SCRAPER_FOLDER = Path('/notebooks/scraper')
CSV_FOLDER = Path('/notebooks/csv/')

# Raw iNaturalist listing and the species selections derived from it.
DATA_PATH = SCRAPER_FOLDER / 'inaturalist/inaturalist.csv'
CSV_PATH = SCRAPER_FOLDER / 'families_3.csv'
ALL_CSV_PATH = SCRAPER_FOLDER / 'species1000.csv'

# Image-level CSVs after filtering.
CLEAN_CSV_PATH = CSV_FOLDER / 'species1000-df-filtered.csv'
CLEAN_SAMPLE_CSV_PATH = CSV_FOLDER / 'families_3.csv'

# Stratified train/validation partitions.
STRATIFIED_CSV_PATH = CSV_FOLDER / 'species1000-stratified.csv'
STRATIFIED_SAMPLE_CSV_PATH = CSV_FOLDER / 'families3-stratified.csv'

# Class-weight tables, one per taxonomic level and per dataset.
WEIGHTS_CSV_PATH = CSV_FOLDER / 'species1000-weights.csv'
WEIGHTS_SAMPLE_CSV_PATH = CSV_FOLDER / 'families3-weights.csv'
WEIGHTS_FAMILIES_CSV_PATH = CSV_FOLDER / 'species1000-weights-family.csv'
WEIGHTS_SAMPLE_FAMILIES_CSV_PATH = CSV_FOLDER / 'families3-weights-family.csv'
WEIGHTS_GENUS_CSV_PATH = CSV_FOLDER / 'species1000-weights-genus.csv'
WEIGHTS_SAMPLE_GENUS_CSV_PATH = CSV_FOLDER / 'families3-weights-genus.csv'
WEIGHTS_ORDER_CSV_PATH = CSV_FOLDER / 'species1000-weights-order.csv'
WEIGHTS_SAMPLE_ORDER_CSV_PATH = CSV_FOLDER / 'families3-weights-order.csv'
Cargamos nuestro conjunto de datos obtenido de la plataforma iNaturalist con las categorías a diferentes niveles de especificación, número de observaciones y URLs por especie.
# Load the scraped iNaturalist listing (one row per species).
df = pd.read_csv(DATA_PATH)
# Preview the first rows.
df.head()
| Order | Family | Subfamily | Genus | Specie | Observations | url | |
|---|---|---|---|---|---|---|---|
| 0 | Acipenseriformes | Acipenseridae | Acipenserinae | Acipenser | Acipenser baerii | 4 | https://www.inaturalist.org/taxa/93179-Acipens... |
| 1 | Acipenseriformes | Acipenseridae | Acipenserinae | Acipenser | Acipenser brevirostrum | 11 | https://www.inaturalist.org/taxa/49823-Acipens... |
| 2 | Acipenseriformes | Acipenseridae | Acipenserinae | Acipenser | Acipenser dabryanus | 0 | https://www.inaturalist.org/taxa/93180-Acipens... |
| 3 | Acipenseriformes | Acipenseridae | Acipenserinae | Acipenser | Acipenser fulvescens | 380 | https://www.inaturalist.org/taxa/93181-Acipens... |
| 4 | Acipenseriformes | Acipenseridae | Acipenserinae | Acipenser | Acipenser gueldenstaedtii | 7 | https://www.inaturalist.org/taxa/93182-Acipens... |
Definimos una función para ordenar de forma descendente los datos por cantidad de observaciones.
def desc_group_by(df, key):
    """Total the observations per category, largest first.

    Only the ``Observations`` column is aggregated, so the text columns
    (names, URLs) are never summed/concatenated and the result always has
    exactly one column regardless of the pandas version.

    Args:
        df: DataFrame with an ``Observations`` column and a ``key`` column.
        key: Name of the column to group by (e.g. ``'Order'``).

    Returns:
        A single-column DataFrame (``Observations``) indexed by ``key`` and
        sorted by descending total. Rows whose ``key`` is NaN are left out,
        as with any default pandas groupby.
    """
    totals = df.groupby(key)[['Observations']].sum()
    return totals.sort_values(by='Observations', ascending=False)
Mostramos las categorías Orden de forma descendente.
orders = desc_group_by(df, 'Order')
# len() counts rows (one per order); DataFrame.size is rows x columns and
# only matches the category count when the frame has a single column.
print('Size:', len(orders))
orders
Size: 46
| Observations | |
|---|---|
| Order | |
| Perciformes | 275755 |
| Cypriniformes | 43970 |
| Tetraodontiformes | 31372 |
| Scorpaeniformes | 21186 |
| Salmoniformes | 14681 |
| Syngnathiformes | 14595 |
| Cyprinodontiformes | 14095 |
| Anguilliformes | 11023 |
| Siluriformes | 10999 |
| Pleuronectiformes | 4200 |
| Esociformes | 4143 |
| Clupeiformes | 3987 |
| Beryciformes | 3140 |
| Lepisosteiformes | 2946 |
| Gasterosteiformes | 2426 |
| Aulopiformes | 2209 |
| Beloniformes | 2126 |
| Atheriniformes | 1847 |
| Characiformes | 1840 |
| Lophiiformes | 1786 |
| Gadiformes | 1562 |
| Gobiesociformes | 1550 |
| Osmeriformes | 1361 |
| Batrachoidiformes | 1117 |
| Elopiformes | 942 |
| Acipenseriformes | 747 |
| Amiiformes | 585 |
| Osteoglossiformes | 529 |
| Synbranchiformes | 369 |
| Percopsiformes | 345 |
| Ophidiiformes | 155 |
| Gymnotiformes | 117 |
| Zeiformes | 113 |
| Albuliformes | 109 |
| Gonorynchiformes | 108 |
| Stomiiformes | 88 |
| Polypteriformes | 70 |
| Lampriformes | 69 |
| Myctophiformes | 38 |
| Mugiliformes | 14 |
| Cetomimiformes | 5 |
| Notacanthiformes | 5 |
| Saccopharyngiformes | 5 |
| Polymixiiformes | 4 |
| Stephanoberyciformes | 3 |
| Ateleopodiformes | 2 |
Mostramos las categorías Familia de forma descendente.
families = desc_group_by(df, 'Family')
# len() counts rows (one per family); DataFrame.size is rows x columns and
# only matches the category count when the frame has a single column.
print('Size:', len(families))
families.head(10)
Size: 490
| Observations | |
|---|---|
| Family | |
| Centrarchidae | 39237 |
| Cyprinidae | 38766 |
| Pomacentridae | 26312 |
| Labridae | 19882 |
| Chaetodontidae | 15153 |
| Salmonidae | 14681 |
| Serranidae | 14195 |
| Gobiidae | 13500 |
| Acanthuridae | 12559 |
| Syngnathidae | 11097 |
Mostramos las categorías Subfamilia de forma descendente.
subfamilies = desc_group_by(df, 'Subfamily')
# len() counts rows (one per subfamily); DataFrame.size is rows x columns
# and only matches the category count when the frame has a single column.
print('Size:', len(subfamilies))
subfamilies.head(10)
Size: 302
| Observations | |
|---|---|
| Subfamily | |
| Pomacentrinae | 22178 |
| Leuciscinae | 20404 |
| Salmoninae | 14108 |
| Corinae | 14002 |
| Cyprininae | 12769 |
| Gobiinae | 10255 |
| Acanthurinae | 9645 |
| Poeciliinae | 8641 |
| Epinephelinae | 8188 |
| Muraeninae | 7035 |
Mostramos las categorías Género de forma descendente.
genera = desc_group_by(df, 'Genus')
# len() counts rows (one per genus); DataFrame.size is rows x columns and
# only matches the category count when the frame has a single column.
print('Size:', len(genera))
genera.head(10)
Size: 4511
| Observations | |
|---|---|
| Genus | |
| Lepomis | 22046 |
| Micropterus | 12970 |
| Chaetodon | 11725 |
| Cyprinus | 8812 |
| Oncorhynchus | 7957 |
| Acanthurus | 6824 |
| Hippocampus | 6503 |
| Lutjanus | 5873 |
| Gambusia | 5697 |
| Gymnothorax | 5288 |
Mostramos las categorías Especie de forma descendente.
species = desc_group_by(df, 'Specie')
# len() counts rows (one per species); DataFrame.size is rows x columns and
# only matches the category count when the frame has a single column.
print('Size:', len(species))
species.head(10)
Size: 30712
| Observations | |
|---|---|
| Specie | |
| Lepomis macrochirus | 9119 |
| Micropterus salmoides | 8615 |
| Cyprinus carpio | 5299 |
| Oncorhynchus mykiss | 4199 |
| Gambusia affinis | 3790 |
| Cyprinus rubrofuscus | 3511 |
| Lepomis cyanellus | 3375 |
| Carassius auratus | 3033 |
| Lepomis gibbosus | 2969 |
| Micropterus dolomieu | 2933 |
Seleccionamos las 1000 especies con más observaciones para formar el conjunto de datos completo.
# Rank every species by observation count and keep the top 1000 rows.
ranked_by_observations = df.sort_values(by='Observations', ascending=False)
df_1000 = ranked_by_observations.head(1000)
df_1000
| Order | Family | Subfamily | Genus | Specie | Observations | url | |
|---|---|---|---|---|---|---|---|
| 13670 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis macrochirus | 9119 | https://www.inaturalist.org/taxa/49591-Lepomis... |
| 13688 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus salmoides | 8615 | https://www.inaturalist.org/taxa/49587-Micropt... |
| 5433 | Cypriniformes | Cyprinidae | Cyprininae | Cyprinus | Cyprinus carpio | 5299 | https://www.inaturalist.org/taxa/53911-Cyprinu... |
| 23808 | Salmoniformes | Salmonidae | Salmoninae | Oncorhynchus | Oncorhynchus mykiss | 4199 | https://www.inaturalist.org/taxa/47516-Oncorhy... |
| 8806 | Cyprinodontiformes | Poeciliidae | Poeciliinae | Gambusia | Gambusia affinis | 3790 | https://www.inaturalist.org/taxa/59115-Gambusi... |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 24336 | Scorpaeniformes | Cyclopteridae | NaN | Cyclopterus | Cyclopterus lumpus | 107 | https://www.inaturalist.org/taxa/60603-Cyclopt... |
| 21655 | Perciformes | Serranidae | Epinephelinae | Cephalopholis | Cephalopholis cyanostigma | 107 | https://www.inaturalist.org/taxa/96992-Cephalo... |
| 4911 | Cypriniformes | Cyprinidae | Barbinae | Barbus | Barbus barbus | 107 | https://www.inaturalist.org/taxa/95147-Barbus-... |
| 10388 | Gobiesociformes | Gobiesocidae | Gobiesocinae | Trachelochismus | Trachelochismus pinnulatus | 107 | https://www.inaturalist.org/taxa/474797-Trache... |
| 15611 | Perciformes | Cichlidae | Pseudocrenilabrinae | Sarotherodon | Sarotherodon melanotheron | 107 | https://www.inaturalist.org/taxa/230431-Saroth... |
1000 rows × 7 columns
Observamos el número de clases por categoría jerárquica.
# Summary of every column; the "unique" row gives the number of distinct
# classes at each taxonomic level.
df_1000.describe(include='all')
| Order | Family | Subfamily | Genus | Specie | Observations | url | |
|---|---|---|---|---|---|---|---|
| count | 1000 | 1000 | 632 | 1000 | 1000 | 1000.000000 | 1000 |
| unique | 29 | 135 | 101 | 454 | 1000 | NaN | 1000 |
| top | Perciformes | Pomacentridae | Pomacentrinae | Chaetodon | Oblada melanura | NaN | https://www.inaturalist.org/taxa/113981-Thalas... |
| freq | 609 | 67 | 57 | 33 | 1 | NaN | 1 |
| mean | NaN | NaN | NaN | NaN | NaN | 362.427000 | NaN |
| std | NaN | NaN | NaN | NaN | NaN | 565.202911 | NaN |
| min | NaN | NaN | NaN | NaN | NaN | 107.000000 | NaN |
| 25% | NaN | NaN | NaN | NaN | NaN | 149.750000 | NaN |
| 50% | NaN | NaN | NaN | NaN | NaN | 225.000000 | NaN |
| 75% | NaN | NaN | NaN | NaN | NaN | 395.500000 | NaN |
| max | NaN | NaN | NaN | NaN | NaN | 9119.000000 | NaN |
Seleccionamos las tres familias con más observaciones para conformar el conjunto de datos reducido.
# Number of top families that make up the reduced dataset.
limit = 3
# `families` is already sorted by descending observations, so the first
# `limit` index labels are the most observed families.
selected_families = families.index[:limit].tolist()
selected_families
['Centrarchidae', 'Cyprinidae', 'Pomacentridae']
Seleccionamos las especies de nuestro conjunto de datos que corresponden a estas familias seleccionadas.
# Keep only the top-1000 species that belong to one of the selected families.
df_selected = df_1000[df_1000['Family'].isin(selected_families)]
df_selected
| Order | Family | Subfamily | Genus | Specie | Observations | url | |
|---|---|---|---|---|---|---|---|
| 13670 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis macrochirus | 9119 | https://www.inaturalist.org/taxa/49591-Lepomis... |
| 13688 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus salmoides | 8615 | https://www.inaturalist.org/taxa/49587-Micropt... |
| 5433 | Cypriniformes | Cyprinidae | Cyprininae | Cyprinus | Cyprinus carpio | 5299 | https://www.inaturalist.org/taxa/53911-Cyprinu... |
| 5452 | Cypriniformes | Cyprinidae | Cyprininae | Cyprinus | Cyprinus rubrofuscus | 3511 | https://www.inaturalist.org/taxa/187316-Cyprin... |
| 13666 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis cyanellus | 3375 | https://www.inaturalist.org/taxa/58636-Lepomis... |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 20574 | Perciformes | Pomacentridae | Pomacentrinae | Chrysiptera | Chrysiptera taupou | 111 | https://www.inaturalist.org/taxa/181017-Chrysi... |
| 20759 | Perciformes | Pomacentridae | Pomacentrinae | Stegastes | Stegastes marginatus | 111 | https://www.inaturalist.org/taxa/154336-Stegas... |
| 20720 | Perciformes | Pomacentridae | Pomacentrinae | Pomacentrus | Pomacentrus sulfureus | 108 | https://www.inaturalist.org/taxa/119310-Pomace... |
| 20428 | Perciformes | Pomacentridae | Pomacentrinae | Amblyglyphidodon | Amblyglyphidodon leucogaster | 108 | https://www.inaturalist.org/taxa/181015-Amblyg... |
| 4911 | Cypriniformes | Cyprinidae | Barbinae | Barbus | Barbus barbus | 107 | https://www.inaturalist.org/taxa/95147-Barbus-... |
144 rows × 7 columns
Observamos el número de clases por categoría jerárquica.
# One bar per species row, labelled with its family on the x axis and with
# the species' observation count as height.
df_selected.plot(x='Family', y='Observations', kind='bar', figsize=(30,5))
<AxesSubplot:xlabel='Family'>
# Summary of the text columns only (count / unique / top / freq).
df_selected.describe(include = 'object')
| Order | Family | Subfamily | Genus | Specie | url | |
|---|---|---|---|---|---|---|
| count | 144 | 144 | 121 | 144 | 144 | 144 |
| unique | 2 | 3 | 11 | 56 | 144 | 144 |
| top | Perciformes | Pomacentridae | Pomacentrinae | Lepomis | Scardinius erythrophthalmus | https://www.inaturalist.org/taxa/509791-Parma-... |
| freq | 90 | 67 | 57 | 12 | 1 | 1 |
Observamos un considerable desbalanceo, por lo que tendremos que tenerlo en cuenta posteriormente.
Buscamos valores perdidos.
def count_nulls(df, column):
    """Return how many entries of ``df[column]`` are missing (NaN/None)."""
    missing_mask = df[column].isna()
    return missing_mask.sum()
# Report the number of missing values at every taxonomic level.
for level in ('Order', 'Family', 'Subfamily', 'Genus', 'Specie'):
    print(f"{level} contains {count_nulls(df_selected, level)} nan values.")
Order contains 0 nan values. Family contains 0 nan values. Subfamily contains 23 nan values. Genus contains 0 nan values. Specie contains 0 nan values.
# Rows of the reduced dataset that have no Subfamily value.
df_nan_selected = df_selected[df_selected['Subfamily'].isnull()]
df_nan_selected
| Order | Family | Subfamily | Genus | Specie | Observations | url | |
|---|---|---|---|---|---|---|---|
| 13670 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis macrochirus | 9119 | https://www.inaturalist.org/taxa/49591-Lepomis... |
| 13688 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus salmoides | 8615 | https://www.inaturalist.org/taxa/49587-Micropt... |
| 13666 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis cyanellus | 3375 | https://www.inaturalist.org/taxa/58636-Lepomis... |
| 13667 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis gibbosus | 2969 | https://www.inaturalist.org/taxa/49614-Lepomis... |
| 13682 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus dolomieu | 2933 | https://www.inaturalist.org/taxa/49590-Micropt... |
| 13672 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis megalotis | 2045 | https://www.inaturalist.org/taxa/58635-Lepomis... |
| 13665 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis auritus | 1628 | https://www.inaturalist.org/taxa/85365-Lepomis... |
| 13693 | Perciformes | Centrarchidae | NaN | Pomoxis | Pomoxis nigromaculatus | 1521 | https://www.inaturalist.org/taxa/49594-Pomoxis... |
| 13659 | Perciformes | Centrarchidae | NaN | Ambloplites | Ambloplites rupestris | 1475 | https://www.inaturalist.org/taxa/58637-Amblopl... |
| 13668 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis gulosus | 922 | https://www.inaturalist.org/taxa/104252-Lepomi... |
| 13673 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis microlophus | 898 | https://www.inaturalist.org/taxa/85127-Lepomis... |
| 13692 | Perciformes | Centrarchidae | NaN | Pomoxis | Pomoxis annularis | 600 | https://www.inaturalist.org/taxa/49593-Pomoxis... |
| 13687 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus punctulatus | 390 | https://www.inaturalist.org/taxa/105821-Microp... |
| 13683 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus floridanus | 311 | https://www.inaturalist.org/taxa/355459-Microp... |
| 13690 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus treculii | 271 | https://www.inaturalist.org/taxa/105822-Microp... |
| 13671 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis marginatus | 260 | https://www.inaturalist.org/taxa/104254-Lepomi... |
| 13661 | Perciformes | Centrarchidae | NaN | Centrarchus | Centrarchus macropterus | 258 | https://www.inaturalist.org/taxa/96911-Centrar... |
| 13676 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis punctatus | 251 | https://www.inaturalist.org/taxa/104256-Lepomi... |
| 13669 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis humilis | 201 | https://www.inaturalist.org/taxa/104253-Lepomi... |
| 13674 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis miniatus | 198 | https://www.inaturalist.org/taxa/104255-Lepomi... |
| 13663 | Perciformes | Centrarchidae | NaN | Enneacanthus | Enneacanthus gloriosus | 119 | https://www.inaturalist.org/taxa/99992-Enneaca... |
| 13675 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis peltastes | 118 | https://www.inaturalist.org/taxa/358056-Lepomi... |
| 13685 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus henshalli | 115 | https://www.inaturalist.org/taxa/225143-Microp... |
Podemos observar que solo existen valores perdidos en la categoría Subfamilia, sin embargo, esto se debe a que no todas las especies presentan este nivel de especificación, por lo que no se debe considerar como valor perdido, pero tampoco se usará para predecir.
Guardamos estos conjuntos seleccionados.
# Persist both selections without the pandas index column:
# the three-family subset and the full top-1000 species list.
df_selected.to_csv(CSV_PATH, index=False)
df_1000.to_csv(ALL_CSV_PATH, index=False)
Una vez descargadas las imágenes con las URLs del conjunto de datos seleccionado, procedemos a comprobar si existen imágenes corruptas y a eliminarlas de este conjunto.
# Import only the two helpers used here instead of a wildcard import, so the
# notebook's existing names (Path, pd, ...) are not silently rebound.
from fastai.vision.all import get_image_files, verify_images

df_path = Path('/root/Documents/')
# Image-level table; its 'fname' column is compared below against paths
# relative to the images folder.
df = pd.read_csv(df_path/"species1000-df.csv")
path = Path('/root/Documents/images')
fns = get_image_files(path)
# Files that could not be opened as images.
failed = verify_images(fns)
# Express each failed file relative to the images folder. relative_to() only
# strips the leading folder, whereas str.replace() would remove every
# occurrence of that prefix inside the path.
failed2 = [str(f.relative_to(path)) for f in failed]
# Drop the rows that point to a corrupt image and save the cleaned table.
filtered_df = df[~df['fname'].isin(failed2)]
filtered_df.to_csv(df_path/'species1000-df-filtered.csv', index=False)
Como vimos anteriormente, existe un desbalanceo considerable entre clases, por lo que trataremos este desbalanceo con particiones estratificadas y pesos de importancia por clase.
Cargamos el conjunto de datos completo.
# Load the image-level table of the full dataset from CLEAN_CSV_PATH.
df_images_1000 = pd.read_csv(CLEAN_CSV_PATH)
Creamos las particiones estratificadas con la función _train_test_split_ de Scikit-Learn.
# Stratified 80/20 split by species with a fixed seed. Only the DataFrame is
# split: the label array previously passed as a second input was discarded,
# and the selected rows depend only on `stratify` and `random_state`.
train, test = train_test_split(
    df_images_1000,
    stratify=df_images_1000['Specie'],
    test_size=0.2,
    random_state=42,
)
# assign() returns new DataFrames, so the partition flag is not written into
# a slice of the original frame (this avoids SettingWithCopyWarning).
train = train.assign(is_valid=False)
test = test.assign(is_valid=True)
<ipython-input-25-5c5634737421>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy train['is_valid'] = False <ipython-input-25-5c5634737421>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy test['is_valid'] = True
# Stack both partitions (train rows first); 'is_valid' tells them apart.
df_1000_stratified = pd.concat([train, test])
Guardamos esta separación de particiones estratificadas en un nuevo CSV.
# Write the partitioned table without its index, then read it back so the
# in-memory frame has a fresh 0..n-1 index.
df_1000_stratified.to_csv(STRATIFIED_CSV_PATH, index=False)
df_1000_stratified = pd.read_csv(STRATIFIED_CSV_PATH)
df_1000_stratified
| Order | Family | Subfamily | Genus | Specie | fname | is_valid | |
|---|---|---|---|---|---|---|---|
| 0 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis gibbosus | Lepomis gibbosus/1879321.jpg | False |
| 1 | Perciformes | Centrarchidae | NaN | Pomoxis | Pomoxis nigromaculatus | Pomoxis nigromaculatus/61056925.jpg | False |
| 2 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis miniatus | Lepomis miniatus/36578909.jpg | False |
| 3 | Cypriniformes | Cyprinidae | Leuciscinae | Scardinius | Scardinius erythrophthalmus | Scardinius erythrophthalmus/77068790.jpg | False |
| 4 | Perciformes | Acanthuridae | Acanthurinae | Acanthurus | Acanthurus olivaceus | Acanthurus olivaceus/14646458.jpg | False |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 422172 | Perciformes | Acanthuridae | Acanthurinae | Acanthurus | Acanthurus coeruleus | Acanthurus coeruleus/33472582.jpg | True |
| 422173 | Perciformes | Gobiidae | Gobiinae | Amblyeleotris | Amblyeleotris steinitzi | Amblyeleotris steinitzi/3789141.jpg | True |
| 422174 | Perciformes | Percidae | Etheostomatinae | Etheostoma | Etheostoma caeruleum | Etheostoma caeruleum/40879890.jpg | True |
| 422175 | Perciformes | Kyphosidae | Microcanthinae | Tilodon | Tilodon sexfasciatus | Tilodon sexfasciatus/26825770.jpg | True |
| 422176 | Perciformes | Scaridae | Sparisomatinae | Sparisoma | Sparisoma viride | Sparisoma viride/48286026.jpg | True |
422177 rows × 7 columns
Observamos el número de imágenes por especie de forma gráfica.
# Images per species (most frequent first) as a two-column table.
df_value_counts = (
    df_1000_stratified['Specie']
    .value_counts()
    .reset_index()
    .set_axis(['Especie', 'Número de imágenes'], axis=1)
)
# Area chart of the counts, rendered with the plotly backend.
df_value_counts.set_index('Especie').plot(backend='plotly', kind='area')
Observamos el número de imágenes por orden de forma gráfica.
# Images per order (most frequent first) as a two-column table.
df_value_counts = (
    df_1000_stratified['Order']
    .value_counts()
    .reset_index()
    .set_axis(['Orden', 'Número de imágenes'], axis=1)
)
# Area chart of the counts, rendered with the plotly backend.
df_value_counts.set_index('Orden').plot(backend='plotly', kind='area')
Observamos el número de imágenes por familia de forma gráfica.
# Images per family (most frequent first) as a two-column table.
df_value_counts = (
    df_1000_stratified['Family']
    .value_counts()
    .reset_index()
    .set_axis(['Familia', 'Número de imágenes'], axis=1)
)
# Area chart of the counts, rendered with the plotly backend.
df_value_counts.set_index('Familia').plot(backend='plotly', kind='area')
Observamos el número de imágenes por género de forma gráfica.
# Images per genus (most frequent first) as a two-column table.
df_value_counts = (
    df_1000_stratified['Genus']
    .value_counts()
    .reset_index()
    .set_axis(['Género', 'Número de imágenes'], axis=1)
)
# Area chart of the counts, rendered with the plotly backend.
df_value_counts.set_index('Género').plot(backend='plotly', kind='area')
Generamos los pesos de importancia por clase con la función _compute_class_weight_ de Scikit-Learn. De esta forma podremos posteriormente dar más importancia a las clases que tienen menos ejemplos en la función de pérdida y así reducir el impacto del desbalanceo durante el entrenamiento.
# Number of images per species; groupby returns the classes in sorted order.
species_count = df_1000_stratified.groupby('Specie')['Specie'].count()
# Dedicated names are used so the `species` DataFrame built earlier from the
# iNaturalist listing is not overwritten, which keeps earlier cells re-runnable.
species_labels = species_count.index.to_numpy()
count_values = species_count.to_numpy()
# 'balanced' weight = n_samples / (n_classes * images_in_class), so species
# with fewer images get a larger weight.
weights = compute_class_weight('balanced', classes=species_labels, y=df_1000_stratified['Specie'])
weights_df = pd.DataFrame({'Specie': species_labels, 'Count': count_values, 'Weight': weights})
weights_df
| Specie | Count | Weight | |
|---|---|---|---|
| 0 | Abramis brama | 319 | 1.323439 |
| 1 | Abudefduf abdominalis | 221 | 1.910303 |
| 2 | Abudefduf bengalensis | 323 | 1.307050 |
| 3 | Abudefduf saxatilis | 1692 | 0.249514 |
| 4 | Abudefduf septemfasciatus | 149 | 2.833403 |
| ... | ... | ... | ... |
| 995 | Zanclus cornutus | 1966 | 0.214739 |
| 996 | Zebrasoma desjardinii | 166 | 2.543235 |
| 997 | Zebrasoma flavescens | 561 | 0.752544 |
| 998 | Zebrasoma scopas | 276 | 1.529627 |
| 999 | Zebrasoma velifer | 500 | 0.844354 |
1000 rows × 3 columns
# Save the species-level weights of the full dataset (no index column).
weights_df.to_csv(WEIGHTS_CSV_PATH, index=False)
Generamos los pesos para el nivel de especificación familia.
# Number of images per family; groupby returns the classes in sorted order.
families_count = df_1000_stratified.groupby('Family')['Family'].count()
# Dedicated names are used so the `families` DataFrame (read when picking the
# top families for the reduced dataset) is not overwritten.
family_labels = families_count.index.to_numpy()
count_values = families_count.to_numpy()
# 'balanced' weight = n_samples / (n_classes * images_in_class).
weights = compute_class_weight('balanced', classes=family_labels, y=df_1000_stratified['Family'])
weights_df = pd.DataFrame({'Family': family_labels, 'Count': count_values, 'Weight': weights})
weights_df
| Family | Count | Weight | |
|---|---|---|---|
| 0 | Acanthuridae | 12587 | 0.248450 |
| 1 | Achiridae | 152 | 20.573928 |
| 2 | Acipenseridae | 300 | 10.424123 |
| 3 | Amiidae | 796 | 3.928690 |
| 4 | Ammodytidae | 271 | 11.539620 |
| ... | ... | ... | ... |
| 130 | Tetrarogidae | 389 | 8.039170 |
| 131 | Trichiuridae | 212 | 14.751118 |
| 132 | Triglidae | 429 | 7.289597 |
| 133 | Umbridae | 586 | 5.336582 |
| 134 | Zanclidae | 1966 | 1.590660 |
135 rows × 3 columns
# Save the family-level weights of the full dataset (no index column).
weights_df.to_csv(WEIGHTS_FAMILIES_CSV_PATH, index=False)
Generamos los pesos para el nivel de especificación género.
# Number of images per genus, indexed by genus name.
genus_count = df_1000_stratified.groupby('Genus')['Genus'].count()
genus = genus_count.index
count_values = genus_count.to_numpy()
# Balanced weights: genera with fewer images receive a larger weight.
weights = compute_class_weight(
    class_weight='balanced',
    classes=genus,
    y=df_1000_stratified['Genus'],
)
weights_df = pd.DataFrame({'Genus': genus, 'Count': count_values, 'Weight': weights})
weights_df
| Genus | Count | Weight | |
|---|---|---|---|
| 0 | Abramis | 319 | 2.915064 |
| 1 | Abudefduf | 5547 | 0.167641 |
| 2 | Acanthaluteres | 246 | 3.780103 |
| 3 | Acanthistius | 179 | 5.195002 |
| 4 | Acanthochromis | 142 | 6.548629 |
| ... | ... | ... | ... |
| 449 | Variola | 189 | 4.920134 |
| 450 | Xiphister | 173 | 5.375175 |
| 451 | Xiphophorus | 668 | 1.392074 |
| 452 | Zanclus | 1966 | 0.472994 |
| 453 | Zebrasoma | 1503 | 0.618699 |
454 rows × 3 columns
# Save the genus-level weights of the full dataset (no index column).
weights_df.to_csv(WEIGHTS_GENUS_CSV_PATH, index=False)
Generamos los pesos para el nivel de especificación orden.
# Number of images per order, indexed by order name.
order_count = df_1000_stratified.groupby('Order')['Order'].count()
order = order_count.index
count_values = order_count.to_numpy()
# Balanced weights: orders with fewer images receive a larger weight.
weights = compute_class_weight(
    class_weight='balanced',
    classes=order,
    y=df_1000_stratified['Order'],
)
weights_df = pd.DataFrame({'Order': order, 'Count': count_values, 'Weight': weights})
weights_df
| Order | Count | Weight | |
|---|---|---|---|
| 0 | Acipenseriformes | 300 | 48.526092 |
| 1 | Amiiformes | 796 | 18.288728 |
| 2 | Anguilliformes | 10738 | 1.355730 |
| 3 | Atheriniformes | 1166 | 12.485272 |
| 4 | Aulopiformes | 1644 | 8.855126 |
| 5 | Batrachoidiformes | 1297 | 11.224231 |
| 6 | Beloniformes | 986 | 14.764531 |
| 7 | Beryciformes | 2431 | 5.988411 |
| 8 | Characiformes | 310 | 46.960734 |
| 9 | Clupeiformes | 3120 | 4.665970 |
| 10 | Cypriniformes | 37620 | 0.386970 |
| 11 | Cyprinodontiformes | 12783 | 1.138843 |
| 12 | Elopiformes | 1147 | 12.692090 |
| 13 | Esociformes | 4540 | 3.206570 |
| 14 | Gadiformes | 519 | 28.049764 |
| 15 | Gasterosteiformes | 2952 | 4.931513 |
| 16 | Gobiesociformes | 1040 | 13.997911 |
| 17 | Lepisosteiformes | 3763 | 3.868676 |
| 18 | Lophiiformes | 1309 | 11.121335 |
| 19 | Osmeriformes | 808 | 18.017113 |
| 20 | Perciformes | 236071 | 0.061667 |
| 21 | Percopsiformes | 331 | 43.981352 |
| 22 | Pleuronectiformes | 2307 | 6.310285 |
| 23 | Salmoniformes | 16258 | 0.895425 |
| 24 | Scorpaeniformes | 16225 | 0.897247 |
| 25 | Siluriformes | 10575 | 1.376627 |
| 26 | Synbranchiformes | 225 | 64.701456 |
| 27 | Syngnathiformes | 16961 | 0.858312 |
| 28 | Tetraodontiformes | 33955 | 0.428739 |
# Save the order-level weights of the full dataset (no index column).
weights_df.to_csv(WEIGHTS_ORDER_CSV_PATH, index=False)
Realizamos el mismo proceso con el conjunto de datos reducido.
# Load the image-level table of the reduced (three-family) dataset.
df_images_families3 = pd.read_csv(CLEAN_SAMPLE_CSV_PATH)
# Stratified 80/20 split by species with a fixed seed. Only the DataFrame is
# split: the label array previously passed as a second input was discarded,
# and the selected rows depend only on `stratify` and `random_state`.
train, test = train_test_split(
    df_images_families3,
    stratify=df_images_families3['Specie'],
    test_size=0.2,
    random_state=42,
)
# assign() returns new DataFrames, so the partition flag is not written into
# a slice of the original frame (this avoids SettingWithCopyWarning).
train = train.assign(is_valid=False)
test = test.assign(is_valid=True)
<ipython-input-50-5c5634737421>:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy <ipython-input-50-5c5634737421>:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
# Stack both partitions (train rows first); 'is_valid' tells them apart.
df_families3_stratified = pd.concat([train, test])
# Write the partitioned table without its index, then read it back so the
# in-memory frame has a fresh 0..n-1 index.
df_families3_stratified.to_csv(STRATIFIED_SAMPLE_CSV_PATH, index=False)
df_families3_stratified = pd.read_csv(STRATIFIED_SAMPLE_CSV_PATH)
df_families3_stratified
| Order | Family | Subfamily | Genus | Specie | fname | is_valid | |
|---|---|---|---|---|---|---|---|
| 0 | Perciformes | Centrarchidae | NaN | Ambloplites | Ambloplites rupestris | Ambloplites rupestris/25240275.jpg | False |
| 1 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus salmoides | Micropterus salmoides/65295609.jpg | False |
| 2 | Perciformes | Centrarchidae | NaN | Micropterus | Micropterus salmoides | Micropterus salmoides/47839127.jpg | False |
| 3 | Perciformes | Pomacentridae | Pomacentrinae | Abudefduf | Abudefduf abdominalis | Abudefduf abdominalis/14442685.jpg | False |
| 4 | Perciformes | Pomacentridae | Pomacentrinae | Chromis | Chromis multilineata | Chromis multilineata/49300790.jpg | False |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 86383 | Perciformes | Pomacentridae | Pomacentrinae | Mecaenichthys | Mecaenichthys immaculatus | Mecaenichthys immaculatus/26249475.jpg | True |
| 86384 | Cypriniformes | Cyprinidae | Leuciscinae | Chrosomus | Chrosomus eos | Chrosomus eos/89045981.jpg | True |
| 86385 | Perciformes | Pomacentridae | Pomacentrinae | Pomacentrus | Pomacentrus coelestis | Pomacentrus coelestis/5496001.jpg | True |
| 86386 | Cypriniformes | Cyprinidae | Cyprininae | Cyprinus | Cyprinus carpio | Cyprinus carpio/41270267.jpg | True |
| 86387 | Perciformes | Centrarchidae | NaN | Lepomis | Lepomis cyanellus | Lepomis cyanellus/91535553.jpg | True |
86388 rows × 7 columns
# Number of images per species in the reduced dataset (sorted by name).
species_count = df_families3_stratified.groupby('Specie')['Specie'].count()
# Dedicated names are used so the `species` DataFrame built earlier from the
# iNaturalist listing is not overwritten, which keeps earlier cells re-runnable.
species_labels = species_count.index.to_numpy()
count_values = species_count.to_numpy()
# 'balanced' weight = n_samples / (n_classes * images_in_class).
weights = compute_class_weight('balanced', classes=species_labels, y=df_families3_stratified['Specie'])
weights_df = pd.DataFrame({'Specie': species_labels, 'Count': count_values, 'Weight': weights})
weights_df
| Specie | Count | Weight | |
|---|---|---|---|
| 0 | Abramis brama | 300 | 1.999722 |
| 1 | Abudefduf abdominalis | 177 | 3.389360 |
| 2 | Abudefduf bengalensis | 299 | 2.006410 |
| 3 | Abudefduf saxatilis | 1423 | 0.421586 |
| 4 | Abudefduf septemfasciatus | 128 | 4.686849 |
| ... | ... | ... | ... |
| 139 | Stegastes partitus | 174 | 3.447797 |
| 140 | Stegastes planifrons | 178 | 3.370318 |
| 141 | Systomus rubripinnis | 587 | 1.022005 |
| 142 | Telestes muticellus | 144 | 4.166088 |
| 143 | Tinca tinca | 246 | 2.438686 |
144 rows × 3 columns
# Save the species-level weights of the reduced dataset (no index column).
weights_df.to_csv(WEIGHTS_SAMPLE_CSV_PATH, index=False)
# Number of images per family in the reduced dataset (sorted by name).
families_count = df_families3_stratified.groupby('Family')['Family'].count()
# Dedicated names are used so the `families` DataFrame (read when picking the
# top families for the reduced dataset) is not overwritten.
family_labels = families_count.index.to_numpy()
count_values = families_count.to_numpy()
# 'balanced' weight = n_samples / (n_classes * images_in_class).
weights = compute_class_weight('balanced', classes=family_labels, y=df_families3_stratified['Family'])
weights_df = pd.DataFrame({'Family': family_labels, 'Count': count_values, 'Weight': weights})
weights_df
| Family | Count | Weight | |
|---|---|---|---|
| 0 | Centrarchidae | 38554 | 0.746900 |
| 1 | Cyprinidae | 28196 | 1.021280 |
| 2 | Pomacentridae | 19638 | 1.466341 |
# Save the family-level weights of the reduced dataset (no index column).
weights_df.to_csv(WEIGHTS_SAMPLE_FAMILIES_CSV_PATH, index=False)
# Number of images per genus in the reduced dataset, indexed by genus name.
genus_count = df_families3_stratified.groupby('Genus')['Genus'].count()
genus = genus_count.index
count_values = genus_count.to_numpy()
# Balanced weights: genera with fewer images receive a larger weight.
weights = compute_class_weight(
    class_weight='balanced',
    classes=genus,
    y=df_families3_stratified['Genus'],
)
weights_df = pd.DataFrame({'Genus': genus, 'Count': count_values, 'Weight': weights})
weights_df
| Genus | Count | Weight | |
|---|---|---|---|
| 0 | Abramis | 300 | 5.142143 |
| 1 | Abudefduf | 4924 | 0.313291 |
| 2 | Acanthochromis | 107 | 14.417223 |
| 3 | Acrossocheilus | 454 | 3.397892 |
| 4 | Alburnus | 252 | 6.121599 |
| 5 | Ambloplites | 1448 | 1.065361 |
| 6 | Amblyglyphidodon | 506 | 3.048701 |
| 7 | Amphiprion | 3102 | 0.497306 |
| 8 | Barbus | 120 | 12.855357 |
| 9 | Campostoma | 725 | 2.127783 |
| 10 | Candidia | 122 | 12.644614 |
| 11 | Carassius | 1502 | 1.027059 |
| 12 | Centrarchus | 282 | 5.470365 |
| 13 | Chromis | 2263 | 0.681680 |
| 14 | Chrosomus | 689 | 2.238959 |
| 15 | Chrysiptera | 206 | 7.488558 |
| 16 | Clinostomus | 229 | 6.736432 |
| 17 | Ctenopharyngodon | 494 | 3.122759 |
| 18 | Cyprinella | 1218 | 1.266538 |
| 19 | Cyprinus | 6954 | 0.221835 |
| 20 | Dascyllus | 1593 | 0.968388 |
| 21 | Enneacanthus | 109 | 14.152687 |
| 22 | Gobio | 157 | 9.825751 |
| 23 | Hesperoleucus | 74 | 20.846525 |
| 24 | Hypophthalmichthys | 106 | 14.553235 |
| 25 | Hypsypops | 676 | 2.282016 |
| 26 | Lepomis | 22205 | 0.069473 |
| 27 | Luxilus | 1411 | 1.093298 |
| 28 | Mecaenichthys | 360 | 4.285119 |
| 29 | Micropterus | 12481 | 0.123599 |
| 30 | Microspathodon | 903 | 1.708353 |
| 31 | Neoglyphidodon | 245 | 6.296501 |
| 32 | Neopomacentrus | 111 | 13.897683 |
| 33 | Nocomis | 992 | 1.555084 |
| 34 | Notemigonus | 925 | 1.667722 |
| 35 | Notropis | 816 | 1.890494 |
| 36 | Opsariichthys | 526 | 2.932781 |
| 37 | Parazacco | 210 | 7.345918 |
| 38 | Parma | 1376 | 1.121107 |
| 39 | Phoxinus | 374 | 4.124714 |
| 40 | Pimephales | 1509 | 1.022295 |
| 41 | Plectroglyphidodon | 312 | 4.944368 |
| 42 | Pomacentrus | 1141 | 1.352010 |
| 43 | Pomoxis | 2029 | 0.760297 |
| 44 | Premnas | 274 | 5.630083 |
| 45 | Pseudorasbora | 174 | 8.865764 |
| 46 | Ptychocheilus | 144 | 10.712798 |
| 47 | Rhinichthys | 1693 | 0.911189 |
| 48 | Rutilus | 541 | 2.851466 |
| 49 | Scardinius | 569 | 2.711147 |
| 50 | Semotilus | 2892 | 0.533417 |
| 51 | Squalius | 1047 | 1.473393 |
| 52 | Stegastes | 1539 | 1.002367 |
| 53 | Systomus | 587 | 2.628012 |
| 54 | Telestes | 144 | 10.712798 |
| 55 | Tinca | 246 | 6.270906 |
# Save the genus-level weights of the reduced dataset (no index column).
weights_df.to_csv(WEIGHTS_SAMPLE_GENUS_CSV_PATH, index=False)
# Number of images per order in the reduced dataset, indexed by order name.
order_count = df_families3_stratified.groupby('Order')['Order'].count()
order = order_count.index
count_values = order_count.to_numpy()
# Balanced weights: orders with fewer images receive a larger weight.
weights = compute_class_weight(
    class_weight='balanced',
    classes=order,
    y=df_families3_stratified['Order'],
)
weights_df = pd.DataFrame({'Order': order, 'Count': count_values, 'Weight': weights})
weights_df
| Order | Count | Weight | |
|---|---|---|---|
| 0 | Cypriniformes | 28196 | 1.531919 |
| 1 | Perciformes | 58192 | 0.742267 |
# Save the order-level weights of the reduced dataset (no index column).
weights_df.to_csv(WEIGHTS_SAMPLE_ORDER_CSV_PATH, index=False)